import pytesseract
from pdf2image import convert_from_path
import os

# Machine-specific install locations (edit these for your environment).
# They are applied only when they actually exist on this machine; otherwise
# pdf2image and pytesseract locate their executables through PATH, which is
# the usual setup on macOS/Linux.
_DEFAULT_POPPLER_DIR = r"E:\poppler\poppler\poppler-24.08.0\Library\bin"
_DEFAULT_TESSERACT_EXE = r"E:\TesseractOcr\tesseract.exe"

# Directory holding the Poppler binaries, or None to rely on PATH.
poppler_path = _DEFAULT_POPPLER_DIR if os.path.isdir(_DEFAULT_POPPLER_DIR) else None

# Override pytesseract's default command only when the configured binary exists.
if os.path.isfile(_DEFAULT_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _DEFAULT_TESSERACT_EXE


def pdf_to_text(pdf_path, output_dir: str = "output", *, dpi: int = 300,
                lang: str = "chi_sim+eng") -> str:
    """Extract the text of a PDF by rendering each page and running OCR on it.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that is created if it does not exist. This
            function writes nothing into it; the directory is still created
            so callers that rely on its existence keep working.
        dpi: Resolution used when rendering pages to images. The default of
            300 is chosen to improve recognition accuracy.
        lang: Tesseract language specification. The default recognizes
            Simplified Chinese together with English.

    Returns:
        The recognized text of all pages in page order, with consecutive
        pages joined by two newline characters. An input that yields no
        pages produces an empty string.
    """
    os.makedirs(output_dir, exist_ok=True)

    convert_kwargs = {"dpi": dpi}
    # Forward the configured Poppler directory only when it really exists, so
    # a stale or Windows-only setting does not break machines where Poppler
    # is reachable through PATH.
    if poppler_path and os.path.isdir(poppler_path):
        convert_kwargs["poppler_path"] = poppler_path

    # One image per page; all pages are rendered before OCR starts.
    images = convert_from_path(pdf_path, **convert_kwargs)

    page_texts = [
        pytesseract.image_to_string(image, lang=lang) for image in images
    ]
    return "\n\n".join(page_texts)


# Usage example. Guarded so that importing this module does not trigger an
# OCR run; executing the file as a script behaves as before.
if __name__ == "__main__":
    pdf_path = "附件3/B3108.pdf"
    extracted_text = pdf_to_text(pdf_path)
    # The text is only printed, not written to disk, so the status message
    # no longer claims it was saved to the output/ directory.
    print("识别完成！")
    print(extracted_text)